from pathlib import Path
import re
import pandas as pd
from datetime import datetime
import asyncio
from playwright.async_api import async_playwright
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import hashlib
from collections import defaultdict
from urllib.parse import urlparse
import os

class Post:
    """A page to export: source URL and title, plus the category/CSV-derived
    folder layout that determines where its HTML file is saved."""

    def __init__(self, url, title, category, csv_subdir=None, csv_filename=None):
        # url/title are kept verbatim; layout fields are normalized to
        # stripped strings ('' when absent).
        self.url = url
        self.title = title
        self.category = str(category).strip() if category else ""
        self.csv_subdir = str(csv_subdir).strip() if csv_subdir else ""
        self.csv_filename = str(csv_filename).strip() if csv_filename else ""
        # The title doubles as a filename: replace Windows-reserved characters.
        self.safe_title = re.sub(r'[<>:"/\\|?*]', '_', str(title))

    def get_save_path(self, base_dir='outputs/hyplusite'):
        """Return base_dir / category / [subdirs] / [csv stem] / '<title>.html'.

        Only the metadata-derived components are sanitized. base_dir is used
        verbatim so its path separators survive — previously it was sanitized
        too, turning 'outputs/hyplusite' into 'outputs_hyplusite', which broke
        relative_to(output_dir) in the index builder.
        """
        parts = [self.category]
        if self.csv_subdir:
            parts.extend(str(part).strip() for part in self.csv_subdir.split('/') if str(part).strip())
        if self.csv_filename:
            parts.append(self.csv_filename)
        safe_parts = [re.sub(r'[<>:"/\\|?*]', '_', str(part)) for part in parts if part]
        return Path(base_dir, *safe_parts) / f"{self.safe_title}.html"

def download_and_replace_images(soup, save_path):
    """Download every external <img> in *soup* and rewrite its src to a local copy.

    Images are stored in an 'images/' folder beside *save_path*, named by an
    MD5 hash of the source URL (so repeated URLs dedupe). Failures are printed
    and skipped — this is deliberately best-effort.
    """
    img_dir = save_path.parent / 'images'
    img_dir.mkdir(parents=True, exist_ok=True)
    for img in soup.find_all('img'):
        src = img.get('src')
        if not (src and src.startswith('http')):
            continue  # skip relative, data:, and missing sources
        try:
            resp = requests.get(src, timeout=10)
            # Bug fix: without this, a 404/500 HTML error page was silently
            # saved to disk as if it were image data.
            resp.raise_for_status()
            urlp = urlparse(src)
            _, ext = os.path.splitext(urlp.path)
            if not ext or len(ext) > 5:
                ext = '.jpg'  # fall back when the URL has no usable extension
            hashname = hashlib.md5(src.encode()).hexdigest()[:10]
            local_img_name = f"{hashname}{ext}"
            local_img_path = img_dir / local_img_name
            local_img_path.write_bytes(resp.content)
            img['src'] = f"images/{local_img_name}"
        except Exception as e:
            print(f"Failed to download image: {src}, error: {str(e)}")
            continue

def build_index_tree(posts, output_dir):
    """Arrange *posts* into a nested dict mirroring their on-disk layout.

    Each level maps a folder name to a child node; a node's 'files' key holds
    a list of (title, html path relative to output_dir, original url) tuples.
    """
    def _node():
        return defaultdict(_node)

    root = _node()
    base = Path(output_dir)
    for post in posts:
        rel = post.get_save_path(base).relative_to(base)
        rel_path = str(rel).replace('\\', '/')  # forward slashes for hrefs
        node = root[post.category]
        if post.csv_subdir:
            for segment in post.csv_subdir.split('/'):
                segment = segment.strip()
                if segment:
                    node = node[segment]
        if post.csv_filename:
            node = node[post.csv_filename]
        node.setdefault('files', []).append((str(post.title), rel_path, str(post.url)))
    return root

def write_index_html(tree, output_dir):
    """Render *tree* (from build_index_tree) as a nested <ul> index.html
    inside *output_dir*. Directories sort before file entries; both are
    listed alphabetically."""
    def _render(node, depth=0):
        pad = "  " * depth
        out = []
        # Sub-directory nodes are defaultdicts; the 'files' list is not, so
        # the key puts directories first, then sorts by name.
        for key, value in sorted(node.items(), key=lambda kv: (not isinstance(kv[1], defaultdict), kv[0])):
            if key == 'files':
                for title, path, url in sorted(value, key=lambda entry: entry[0]):
                    out.append(f'{pad}<li><a href="{path}">{title}</a> - <a href="{url}" target="_blank">{url}</a></li>')
            else:
                out.append(f'{pad}<li><strong>{key}</strong>\n{pad}<ul>')
                out.extend(_render(value, depth + 1))
                out.append(f'{pad}</ul></li>')
        return out

    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    document = [
        '<!DOCTYPE html>',
        '<html lang="zh-CN">',
        '<head>',
        '<meta charset="UTF-8">',
        '<title>Hyplus Index - Hyplusite Exporter</title>',
        '<meta name="author" content="Akira37-hyperplasma">',
        f'<meta name="generated" content="{stamp}">',
        '<style>',
        'body{font-family:system-ui,-apple-system,sans-serif;line-height:1.4;max-width:800px;margin:30px auto;padding:0 20px;color:#24292e}',
        'ul{margin:0 0 0 1.5em;padding:0;}',
        'li{margin:.2em 0;}',
        'strong{color:#24292e;font-size:1.1em;}',
        'a{color:#0366d6;text-decoration:none;}',
        'a:hover{text-decoration:underline;}',
        '.meta{color:#666;font-size:0.9em;margin-bottom:20px;}',
        '</style>',
        '</head>',
        '<body>',
        '<h1>Hyplus Index</h1>',
        f'<p class="meta">Generated by Hyplusite Exporter on {stamp}.<br>Enjoy your reading experience at any time!</p>',
        '<ul>',
    ]
    document.extend(_render(tree))
    document.extend(['</ul>', '</body>', '</html>'])
    target = Path(output_dir) / "index.html"
    try:
        target.write_text('\n'.join(document), encoding="utf-8")
    except Exception as e:
        print(f"Error writing index.html: {str(e)}")

async def save_webpage_to_html_async(post, browser, output_dir='outputs/hyplusite', page_timeout=30000):
    """Render *post.url* in a fresh browser context and save the page, with
    images localized, to its computed path under *output_dir*.

    Returns a human-readable status string; failures are also appended to the
    error log via log_error(). Existing files are skipped, so re-runs resume
    cheaply.
    """
    save_path = post.get_save_path(output_dir)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    if save_path.exists():
        return f"File already exists, skipped: {str(save_path)}"
    context = None
    try:
        context = await browser.new_context(
            viewport={'width': 1280, 'height': 720},
            java_script_enabled=True,
            bypass_csp=True
        )
        page = await context.new_page()
        page.set_default_timeout(page_timeout)
        response = await page.goto(post.url, wait_until='domcontentloaded')
        if response and response.status == 503:
            msg = f"503 Service Unavailable detected for {str(post.url)}. This may be due to rate limiting or server protection."
            log_error(post, msg)
            return f"Download failed (503): {str(post.url)}"
        try:
            # Give client-side rendering a chance to produce the article body,
            # but don't fail the whole download if it never appears.
            # (Narrowed from a bare except, which also swallowed CancelledError.)
            await page.wait_for_selector('article', timeout=5000)
        except Exception:
            pass
        html_content = await page.content()
        soup = BeautifulSoup(html_content, 'html.parser')
        download_and_replace_images(soup, save_path)
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        return f"Success: {str(post.title)}"
    except Exception as e:
        error_msg = f"Download failed {str(post.url)}: {str(e)}"
        log_error(post, str(e))
        return error_msg
    finally:
        # Bug fix: the original only closed the context on the success and 503
        # paths, leaking a browser context on every other exception.
        if context is not None:
            try:
                await context.close()
            except Exception:
                pass

def log_error(post, error_msg):
    """Append a timestamped error entry for *post* to logs/error_log.txt."""
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log_dir = Path('logs')
    log_dir.mkdir(exist_ok=True)
    # Tolerate arbitrary objects: fall back to 'Unknown' for missing fields.
    title = str(getattr(post, 'title', 'Unknown'))
    url = str(getattr(post, 'url', 'Unknown'))
    entry = f"[{stamp}] {title} ({url}): {str(error_msg)}\n"
    with (log_dir / 'error_log.txt').open('a', encoding='utf-8') as handle:
        handle.write(entry)

def parse_posts_file(data_dir='data'):
    """Scan *data_dir*/<category>/**/*.csv and build a Post per CSV row.

    The top-level folder name becomes the post's category; the CSV's relative
    sub-path and stem become its output layout. Each CSV must have 'url' and
    'title' columns. Unreadable files are reported and skipped.
    """
    collected = []
    base = Path(data_dir)
    if not base.exists():
        print(f"Data directory {base} does not exist.")
        return collected
    for category_dir in base.iterdir():
        if not category_dir.is_dir():
            continue
        for csv_file in category_dir.rglob("*.csv"):
            parent_rel = csv_file.relative_to(category_dir).parent
            subdir = '' if str(parent_rel) == '.' else str(parent_rel).replace('\\', '/')
            try:
                # Row iteration stays inside the try: a malformed row aborts
                # this file (keeping rows appended so far) without crashing
                # the whole scan.
                frame = pd.read_csv(csv_file)
                for _, row in frame.iterrows():
                    collected.append(Post(
                        url=row['url'],
                        title=row['title'],
                        category=category_dir.name,
                        csv_subdir=subdir,
                        csv_filename=csv_file.stem,
                    ))
            except Exception as e:
                print(f"Error parsing CSV file {str(csv_file)}: {str(e)}")
                continue
    return collected

async def download_batch(posts, semaphore, browser, output_dir, page_timeout):
    """Download all *posts* concurrently, with *semaphore* capping how many
    pages render at once.

    Returns the list of per-post status strings in input order.

    Bug fix: the semaphore used to be acquired once around the entire batch,
    so every page in a batch ran concurrently and the configured
    concurrent_downloads limit was never enforced. Acquiring one slot per
    page restores the intended cap.
    """
    async def _bounded(post):
        # One semaphore slot per page, released as soon as that page finishes.
        async with semaphore:
            return await save_webpage_to_html_async(
                post,
                browser,
                output_dir=output_dir,
                page_timeout=page_timeout
            )

    return await asyncio.gather(*(_bounded(post) for post in posts))

async def download_webpages_async(
    data_dir='data',
    concurrent_downloads=5,
    batch_size=10,
    page_timeout=30000,
    output_dir='outputs/hyplusite'
):
    """Download every post listed under *data_dir* into *output_dir*.

    Progress is checkpointed to logs/download_progress.txt after each batch so
    an interrupted run resumes where it left off. When all batches finish, an
    index.html linking every saved page is generated in *output_dir*.

    Args:
        data_dir: root folder of category/CSV listings (see parse_posts_file).
        concurrent_downloads: semaphore size passed to download_batch.
        batch_size: posts scheduled per gather round.
        page_timeout: per-page Playwright timeout, in milliseconds.
        output_dir: root folder for saved HTML and the generated index.
    """
    posts = parse_posts_file(data_dir)
    total = len(posts)
    if total == 0:
        print("No pages found to download.")
        return
    print(f"Found {total} pages to download.")
    log_dir = Path('logs')
    log_dir.mkdir(exist_ok=True)
    progress_file = log_dir / 'download_progress.txt'
    last_index = 0
    if progress_file.exists():
        try:
            last_index = int(progress_file.read_text().strip())
        except ValueError:
            # Bug fix: an empty/corrupt checkpoint file used to raise an
            # unhandled ValueError here; start over instead.
            last_index = 0
        posts = posts[last_index:]
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--disable-gpu',
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                ]
            )
            semaphore = asyncio.Semaphore(concurrent_downloads)
            with tqdm(total=len(posts), desc="Download progress") as pbar:
                for i in range(0, len(posts), batch_size):
                    batch = posts[i:i + batch_size]
                    results = await download_batch(
                        batch,
                        semaphore,
                        browser,
                        output_dir,
                        page_timeout
                    )
                    # Checkpoint AFTER the batch completes, using absolute
                    # index so resumed runs keep counting correctly.
                    progress_file.write_text(str(last_index + i + len(batch)))
                    pbar.update(len(batch))
                    for result in results:
                        if "503" in result or result.startswith("Download failed"):
                            print(f"\n{result}")
            await browser.close()
        # Rebuild the index over ALL posts (not just this run's remainder).
        tree = build_index_tree(parse_posts_file(data_dir), output_dir)
        write_index_html(tree, output_dir)
        print(f"\nIndex page generated at: {str(Path(output_dir) / 'index.html')}")
    except KeyboardInterrupt:
        print("\nUser interrupted download")
    except Exception as e:
        print(f"\nError occurred: {str(e)}")

async def download_single_page(url, output_dir='outputs/hyplusite', page_timeout=30000):
    """Fetch a single ad-hoc *url* into output_dir/single_pages/ using a
    throwaway headless Chromium instance, printing the result string."""
    try:
        async with async_playwright() as pw:
            chromium = await pw.chromium.launch(
                headless=True,
                args=[
                    '--disable-gpu',
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                ]
            )
            target = Post(url=url, title='hyplus', category='single_pages')
            outcome = await save_webpage_to_html_async(
                target,
                chromium,
                output_dir=output_dir,
                page_timeout=page_timeout
            )
            print(outcome)
            await chromium.close()
    except Exception as e:
        print(f"Download failed: {str(e)}")

if __name__ == "__main__":
    url = "https://www.hyperplasma.top/hyplus"
    asyncio.run(download_single_page(url))